measures_of_interest %>%
ggplot(aes(x = date, y = count)) +
geom_line(aes(group = measure, color = measure)) +
geom_point(aes(group = seq_along(date), color = measure)) +
transition_reveal(date) +
scale_x_date(breaks = "4 months", labels = date_format("%b-%Y")) +
scale_colour_discrete(name = "Membership Type",
breaks = c("ct_annual_members", "ct_single_day_passes",
"ct_single_trip_passes"),
labels = c("Annual Passes Renewed or Purchased",
"Single-Day Passes Purchased",
"Single-Trip Passes Purchased")) +
labs(x = "Date", y = "Count") +
ggtitle("CitiBike Memberships Purchased by Month in NYC and New Jersey City") +
theme_minimal() +
theme(legend.position = "bottom") +
guides(color = guide_legend(nrow = 2, byrow = TRUE)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

manhattan_rides_df <- read_csv("manhattan_rides.csv")
manhattan_rides_df <-
manhattan_rides_df %>%
mutate(
day_of_week = factor(day_of_week, ordered = T,
levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
year = factor(year),
age_group = factor(age_group, ordered = T,
levels = c("18-25","26-35", "36-45", "46-55", "56-65", "66-85")),
gender = type.convert(gender, as.is = F))
manhattan_rides_df %>%
group_by(age_group) %>%
summarize(min = min(age), max = max(age), obs = n())
## # A tibble: 6 × 4
## age_group min max obs
## <ord> <dbl> <dbl> <int>
## 1 18-25 18 25 35068
## 2 26-35 26 35 102948
## 3 36-45 36 45 56694
## 4 46-55 46 55 43430
## 5 56-65 56 65 26232
## 6 66-85 66 85 6734
manhattan_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
filter(tripduration < 2500) %>%
plot_ly(
x = ~month,
y = ~trip_min,
color = ~year,
type = "box") %>%
layout(
boxmode = "group",
title = "Duration of Citibike Rides by Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Trip Duration in Minutes")
)
Looks like maybe the overall length of trips in 2019 was more consistent. 2020 had a bump in duration of rides, starting in April. Overall, trip length seems more variable in 2020.
manhattan_rides_df %>%
group_by(year) %>%
mutate(
month = month(starttime, label = T)
) %>%
group_by(year, month) %>%
summarise(obs = n()) %>%
plot_ly(
x = ~month,
y = ~obs,
color = ~year,
type = "scatter",
mode = "lines") %>%
layout(
title = "Number of Citibike Rides per Month",
xaxis = list(title = "Month"),
yaxis = list(title = "Rides")
)
Huge drop in monthly trips in April 2020. Lockdown started mid/late March so this coincides with people transitioning to WFH and largely staying inside to minimize contacts. The ride numbers bounce back quite a bit after this but not to 2019 levels.
citi_pc_change =
manhattan_rides_df %>%
mutate(date = format(stoptime, format = "%m-%d-%Y")) %>%
group_by(stop_date, year) %>%
summarize(daily_rides = n()) %>%
ungroup() %>%
group_by(stop_date) %>%
arrange(year, .by_group = T) %>%
mutate(percent_change = (daily_rides/lag(daily_rides) - 1) * 100) %>%
filter(year == 2020) %>%
select(date = stop_date, percent_change) %>%
mutate(transit_system = "citi_bike",
date = paste0("2020-", date),
date = as.Date(date, "%Y-%m-%d"))
## `summarise()` has grouped output by 'stop_date'. You can override using the `.groups` argument.
# Using ggplot
post_covid_transit = citi_pc_change %>%
ungroup() %>%
drop_na() %>%
ggplot(aes(x = date, y = percent_change, color = transit_system)) +
geom_smooth(aes(color = transit_system))
ggplotly(post_covid_transit)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Ridership data for 2019/2020 Manhattan
turnstiles_2019_m = read_csv("2019-turnstile.csv") %>%
filter(borough == "M") %>%
mutate(gtfs_latitude = as.numeric(gtfs_latitude),
gtfs_longitude = as.numeric(gtfs_longitude))
## Rows: 159384 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): stop_name, daytime_routes, division, line, borough, structure, gtf...
## dbl (2): entries, exits
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
turnstiles_2020_m = read_csv("2020-turnstile.csv") %>%
filter(borough == "M") %>%
mutate(gtfs_latitude = as.numeric(gtfs_latitude),
gtfs_longitude = as.numeric(gtfs_longitude))
## Rows: 159533 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): stop_name, daytime_routes, division, line, borough, structure, com...
## dbl (4): gtfs_longitude, gtfs_latitude, entries, exits
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
turnstiles_19_20_m = rbind(turnstiles_2019_m, turnstiles_2020_m)
## Warning: One or more parsing issues, see `problems()` for details
#Ridership data 3/1/2020 - today
ridership_covid_changes = read_csv("covid-ridership.csv") %>%
janitor::clean_names() %>%
mutate(date = as.Date(date, "%m/%d/%Y")) %>%
rename(buses_ter = buses_total_estimated_ridership) %>%
rename(lirr_ter = lirr_total_estimated_ridership) %>%
rename(metro_north_ter = metro_north_total_estimated_ridership) %>%
rename(subways_ter = subways_total_estimated_ridership) %>%
rename(subways_pc = subways_percent_change_from_pre_pandemic_equivalent_day) %>%
rename(metro_north_pc = metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average) %>%
rename(lirr_pc = lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average) %>%
rename(buses_pc = buses_percent_change_from_pre_pandemic_equivalent_day) %>%
rename(bridges_and_tunnels_pc = bridges_and_tunnels_percent_change_from_pre_pandemic_equivalent_day) %>%
rename(access_a_ride_ter = access_a_ride_total_scheduled_trips) %>%
rename(access_a_ride_pc = access_a_ride_percent_change_from_pre_pandemic_equivalent_day)
## Rows: 628 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Date, Subways: % Change From Pre-Pandemic Equivalent Day, Buses: % ...
## dbl (5): Subways: Total Estimated Ridership, Buses: Total Estimated Ridershi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ridership_covid_changes_2020 = ridership_covid_changes %>%
filter(date <= as.Date('2020-12-31'))
ridership_covid_pc_tidy =
ridership_covid_changes_2020 %>%
select(date, access_a_ride_pc, bridges_and_tunnels_pc, buses_pc, lirr_pc, metro_north_pc, subways_pc) %>%
pivot_longer(
c(access_a_ride_pc:subways_pc),
names_to = "transit_system",
values_to = "percent_change"
) %>%
mutate(transit_system = gsub("_pc", "", transit_system),
percent_change = gsub("%", "", percent_change),
percent_change = as.numeric(percent_change))
ridership_pc_change =
bind_rows(ridership_covid_pc_tidy, citi_pc_change)
# Using ggplot
post_covid_transit = ridership_pc_change %>%
ungroup() %>%
drop_na() %>%
filter(date >= as.Date('2020-03-01')) %>%
ggplot(aes(x = date, y = percent_change, color = transit_system)) +
geom_smooth(aes(color = transit_system))
ggplotly(post_covid_transit)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Using plotly
ridership_pc_change %>%
ungroup() %>%
drop_na() %>%
filter(date >= as.Date('2020-03-01')) %>%
plot_ly(
x = ~date,
y = ~percent_change,
color = ~transit_system,
type = "scatter",
mode = "lines") %>%
layout(
title = "Ridership Transit System Percent Change Following COVID",
xaxis = list(title = "Date"),
yaxis = list(title = "Percent Change")
)